# %%
import datetime
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly
import plotly.express as px
import plotly.graph_objs as go  # legacy alias, kept from the original notebook
import plotly.graph_objects as go  # imported last so `go` is graph_objects
import seaborn as sns
from scipy import stats

# List every input file available in the Kaggle environment.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


# %%
def _report_dataset(name, frame):
    """Print the shape, missing-value count and dtypes of *frame*.

    Missing values are counted once with ``isna()``. ``isnull()`` is an alias
    of ``isna()``, so adding both (as this notebook used to do) doubles the
    reported number.

    Args:
        name: Label used in the printed summary.
        frame: DataFrame to summarize.

    Returns:
        The total number of missing cells in *frame*.
    """
    missing = int(frame.isna().sum().sum())
    print('{} dataset has {} entries , {} features and {} missing values'.format(
        name, frame.shape[0], frame.shape[1], missing))
    print("\nFeature's datatypes\n\n{}".format(frame.dtypes))
    return missing


CabData = pd.read_csv('Cab_Data.csv')
missing_value = _report_dataset('CabData', CabData)
CabData.head(10)

# %%
City = pd.read_csv('City.csv')
missing_value = _report_dataset('City', City)
City.head(20)

# %%
CustomerID = pd.read_csv('Customer_ID.csv')
missing_value = _report_dataset('CustomerID', CustomerID)
CustomerID.head(10)

# %%
TransactionID = pd.read_csv('Transaction_ID.csv')
missing_value = _report_dataset('TransactionID', TransactionID)
TransactionID.head(10)

# %%
# Join the four sources into one frame: rides -> transactions -> customers -> cities.
MasterData = (
    CabData.merge(TransactionID, on='Transaction ID')
    .merge(CustomerID, on='Customer ID')
    .merge(City, on='City')
)
missing_value = _report_dataset('MasterData', MasterData)
MasterData.head(10)

# %%
# Replace spaces with '_' in column names.
MasterData = MasterData.rename(columns=lambda name: name.replace(' ', '_'))

# Remove ',' thousands separators from the Population and Users values.
for column in ["Population", "Users"]:
    MasterData[column] = MasterData[column].str.replace(',', '', regex=False)

MasterData.head(10)

# %%
# Convert object columns to category / int64 and the travel date to datetime.
# NOTE(review): pd.to_datetime treats plain integers as nanoseconds since the
# Unix epoch; if Date_of_Travel is stored as a spreadsheet serial number this
# call needs an explicit origin/unit - confirm against the CSV.
MasterData['Date_of_Travel'] = pd.to_datetime(MasterData['Date_of_Travel'])

for column in ["Company", "City", "Payment_Mode", "Gender"]:
    MasterData[column] = MasterData[column].astype('category')

for column in ["Population", "Users"]:
    MasterData[column] = MasterData[column].astype('int64')

print("\nFeature's datatypes\n\n{}".format(MasterData.dtypes))

# %%
# Split MasterData per cab firm so each firm can be analyzed individually.
Pink_MasterData = MasterData[MasterData['Company'] == 'Pink Cab']
Yellow_MasterData = MasterData[MasterData['Company'] == 'Yellow Cab']
selected_columns = ['KM_Travelled', 'Price_Charged', 'Cost_of_Trip', 'Age',
                    'Income_(USD/Month)', 'Population', 'Users']

# %%
# Summary statistics of the selected columns for the Pink Cab firm.
filtered_Pink_MasterData = Pink_MasterData[selected_columns]
Pink_Cab_StatsInfo = filtered_Pink_MasterData.describe()
Pink_Cab_StatsInfo

# %%
# Summary statistics of the selected columns for the Yellow Cab firm.
filtered_Yellow_MasterData = Yellow_MasterData[selected_columns]
Yellow_Cab_StatsInfo = filtered_Yellow_MasterData.describe()
Yellow_Cab_StatsInfo

# %%
# Histograms with KDE curves of the selected variables, split by cab firm.
fig, axes = plt.subplots(1, 3, figsize=(26, 8), sharey=True)
fig.suptitle('Distributions of Variables')
for ax, column in zip(axes, ['Cost_of_Trip', 'Price_Charged', 'Income_(USD/Month)']):
    sns.histplot(ax=ax, data=MasterData, x=column, kde=True, hue="Company")

# %%
fig, axes = plt.subplots(1, 2, figsize=(22, 9), sharey=True)
fig.suptitle('Distributions of Variables')
for ax, column in zip(axes, ['Age', 'KM_Travelled']):
    sns.histplot(ax=ax, data=MasterData, x=column, kde=True, hue="Company")

# %%
fig, axes = plt.subplots(1, 2, figsize=(22, 9), sharey=True)
fig.suptitle('Distributions of Variables')
for ax, column in zip(axes, ['Population', 'Users']):
    sns.histplot(ax=ax, data=MasterData, x=column, kde=True, hue="Company")


# %%
def showIQRstats(dataFrame_statsinfo, column_name):
    """Print mean/median/quartile statistics and return the IQR outlier limits.

    The statistics are read by label from a ``DataFrame.describe()`` table
    rather than by row position, so the lookup does not depend on the row
    order of the table.

    Args:
        dataFrame_statsinfo: Output of ``DataFrame.describe()``; must contain
            the rows ``'mean'``, ``'25%'``, ``'50%'`` and ``'75%'``.
        column_name: Column of the table to report on.

    Returns:
        A ``(upper_band, lower_band)`` tuple, i.e. ``Q3 + 1.5 * IQR`` and
        ``Q1 - 1.5 * IQR``. Note the order: the upper limit comes first.

    Raises:
        KeyError: If the column or one of the required rows is missing.
    """
    column_stats = dataFrame_statsinfo[column_name]
    mean = column_stats.loc['mean']
    median = column_stats.loc['50%']
    median_mean_ratio = median / mean
    Q1 = column_stats.loc['25%']
    Q3 = column_stats.loc['75%']
    IQR = Q3 - Q1
    upper_band = Q3 + 1.5 * IQR
    lower_band = Q1 - 1.5 * IQR
    print(
        '\n\n'
        + "mean of : " + str(column_name) + " is " + str(mean) + '\n'
        + "median of : " + str(column_name) + " is " + str(median) + '\n'
        + "median/mean ratio of : " + str(column_name) + " is " + str(median_mean_ratio) + '\n'
        + "Q1 value of : " + str(column_name) + " is " + str(Q1) + '\n'
        + "Q3 value of : " + str(column_name) + " is " + str(Q3) + '\n'
        + "IQR value of : " + str(column_name) + " is " + str(IQR) + '\n'
        + "Upper and Lower Limits of " + str(column_name) + " is " + str((lower_band, upper_band))
    )
    return upper_band, lower_band


def _report_outliers(stats_info, frame, columns):
    """Print the IQR statistics and the outlier values of each column in *columns*."""
    for column_name in columns:
        upper_band, lower_band = showIQRstats(stats_info, column_name)
        values = frame[column_name]
        # Anything outside [lower_band, upper_band] counts as an outlier.
        outliers = values[(values < lower_band) | (values > upper_band)].values
        print('\n' + str(column_name) + " has " + str(len(outliers)) + " outliers : " + str(outliers))
        print('\n\n**********************************************************\n')


# %%
print('Statistical infos of Pink Cab Firm :')
_report_outliers(Pink_Cab_StatsInfo, filtered_Pink_MasterData, selected_columns)

# %%
print('Statistical infos of Yellow Cab Firm :')
_report_outliers(Yellow_Cab_StatsInfo, filtered_Yellow_MasterData, selected_columns)


# %%
def _draw_boxplot_row(row_axes, frame, columns, label):
    """Draw one boxplot per column of *frame* on *row_axes*, labelled with *label*."""
    for ax, column in zip(row_axes, columns):
        sns.boxplot(ax=ax, x=column, data=frame).set_ylabel(label)


# Boxplots of the selected variables: top row Pink Cab, bottom row Yellow Cab.
fig, axes = plt.subplots(2, 3, figsize=(25, 8), sharey=True)
fig.suptitle('Boxplot Distributions of the Variables')
trip_columns = ['KM_Travelled', 'Price_Charged', 'Cost_of_Trip']
_draw_boxplot_row(axes[0], filtered_Pink_MasterData, trip_columns, "Pink Cab")
_draw_boxplot_row(axes[1], filtered_Yellow_MasterData, trip_columns, "Yellow Cab")

# %%
fig, axes = plt.subplots(2, 3, figsize=(25, 8), sharey=True)
fig.suptitle('Boxplot Distributions of the Variables')
customer_columns = ['Age', 'Income_(USD/Month)', 'Population']
_draw_boxplot_row(axes[0], filtered_Pink_MasterData, customer_columns, "Pink Cab")
_draw_boxplot_row(axes[1], filtered_Yellow_MasterData, customer_columns, "Yellow Cab")

# %%
fig, axes = plt.subplots(1, 2, figsize=(25, 8), sharey=True)
sns.boxplot(ax=axes[0], x='Users', data=filtered_Pink_MasterData).set_ylabel("Pink Cab")
sns.boxplot(ax=axes[1], x='Users', data=filtered_Yellow_MasterData).set_ylabel("Yellow Cab")

# %%
# Correlation heatmap of the selected features for the Pink Cab firm.
Pink_Cab_Corr = filtered_Pink_MasterData.corr()
plt.figure(figsize=(20, 10))
sns.heatmap(Pink_Cab_Corr, annot=True, vmin=-1, vmax=1, cmap='coolwarm')

# %%
# Correlation heatmap of the selected features for the Yellow Cab firm.
Yellow_Cab_Corr = filtered_Yellow_MasterData.corr()
plt.figure(figsize=(20, 10))
sns.heatmap(Yellow_Cab_Corr, annot=True, vmin=-1, vmax=1, cmap='coolwarm')

# %%
# Scatter plots of feature pairs to inspect possible correlations.
# The figure titles used to read "Boxplot Distributions" although these are
# scatter plots covering both firms.
fig, axes = plt.subplots(1, 2, figsize=(25, 8), sharey=True)
fig.suptitle('Scatter Plots')
sns.scatterplot(ax=axes[0], data=MasterData, x='KM_Travelled', y='Price_Charged',
                hue="Company").set_title("KM_Travelled - Price_Charged")
sns.scatterplot(ax=axes[1], data=MasterData, x='KM_Travelled', y='Cost_of_Trip',
                hue="Company").set_title("KM_Travelled - Cost_of_Trip")

# %%
fig, axes = plt.subplots(figsize=(16, 6), sharey=True)
fig.suptitle('Scatter Plots')
sns.scatterplot(data=MasterData, x='Price_Charged', y='Cost_of_Trip',
                hue="Company").set_title("Price_Charged - Cost_of_Trip")

# %%
fig, axes = plt.subplots(figsize=(16, 6), sharey=True)
fig.suptitle('Scatter Plots')
sns.scatterplot(data=MasterData, x='Users', y='Population',
                hue="Company").set_title("Population - Users")

# %%
fig, axes = plt.subplots(1, 2, figsize=(25, 8), sharey=True)
fig.suptitle('Scatter Plots')
sns.scatterplot(ax=axes[0], data=MasterData, x='Population', y='Price_Charged',
                hue="Company").set_title("Population - Price_Charged")
sns.scatterplot(ax=axes[1], data=MasterData, x='Users', y='Price_Charged',
                hue="Company").set_title("Users - Price_Charged")

# %%
# Covariance of each feature pair (off-diagonal entry of the 2x2 matrix).
covariance_pairs = [
    ('KM_Travelled', 'Price_Charged'),
    ('KM_Travelled', 'Cost_of_Trip'),
    ('Price_Charged', 'Cost_of_Trip'),
    ('Users', 'Population'),
    ('Population', 'Price_Charged'),
    ('Users', 'Price_Charged'),
]
cov1, cov2, cov3, cov4, cov5, cov6 = (
    np.cov(MasterData[first], MasterData[second])[0][1]
    for first, second in covariance_pairs
)
for (first, second), value in zip(covariance_pairs, (cov1, cov2, cov3, cov4, cov5, cov6)):
    print('Covariance for ' + first + ' - ' + second + ' : ' + str(value))

# %%
# Pearson correlation coefficient of each feature pair (p-values discarded).
(pcorr_coef1, pcorr_coef2, pcorr_coef3,
 pcorr_coef4, pcorr_coef5, pcorr_coef6) = (
    stats.pearsonr(MasterData[first], MasterData[second])[0]
    for first, second in covariance_pairs
)
pearson_values = (pcorr_coef1, pcorr_coef2, pcorr_coef3,
                  pcorr_coef4, pcorr_coef5, pcorr_coef6)
for (first, second), value in zip(covariance_pairs, pearson_values):
    print('pearson correlation coefficient for ' + first + ' - ' + second + ' : ' + str(value))

# %%
# Obtained spearman rank correlations and p-values for given features.
spearman_pairs = [
    ('KM_Travelled', 'Price_Charged'),
    ('KM_Travelled', 'Cost_of_Trip'),
    ('Price_Charged', 'Cost_of_Trip'),
    ('Users', 'Population'),
    ('Population', 'Price_Charged'),
    ('Users', 'Price_Charged'),
]
# Each result holds both the rank correlation and its p-value.
(spearman_rank_coeff1, spearman_rank_coeff2, spearman_rank_coeff3,
 spearman_rank_coeff4, spearman_rank_coeff5, spearman_rank_coeff6) = (
    stats.spearmanr(MasterData[first], MasterData[second])
    for first, second in spearman_pairs
)
spearman_results = (spearman_rank_coeff1, spearman_rank_coeff2, spearman_rank_coeff3,
                    spearman_rank_coeff4, spearman_rank_coeff5, spearman_rank_coeff6)
for (first, second), result in zip(spearman_pairs, spearman_results):
    print('spearman rank coefficient coefficient for ' + first + ' - ' + second
          + ' : ' + str(result))


# %%
def _pie_chart(frame, column, title):
    """Return a plotly pie of ``frame[column]`` with one slice per index label."""
    figure = px.pie(frame, values=frame[column], names=frame.index, title=title)
    figure.update_traces(textposition='inside', textinfo='percent+label')
    figure.update_layout(width=1100, height=600)
    return figure


def _company_bar_chart(pink_counts, yellow_counts, title, pink_color, yellow_color):
    """Return a bar chart comparing the 'Users' counts of the two cab firms."""
    figure = go.Figure()
    figure.add_trace(go.Bar(x=pink_counts.index, y=pink_counts['Users'],
                            name='Pink Cab', marker_color=pink_color))
    figure.add_trace(go.Bar(x=yellow_counts.index, y=yellow_counts['Users'],
                            name='Yellow Cab', marker_color=yellow_color))
    figure.update_layout(yaxis_title="Users", title=title)
    return figure


def _age_group_counts(frame):
    """Return per-column non-null counts of *frame* for each age group.

    The result has one row per age group (YOUNG: 18 <= Age < 25,
    MIDDLE: 25 <= Age < 40, OLD: 40 <= Age <= 65) and one column per column
    of *frame*. Rows with an age outside 18..65 fall into no group.
    """
    age = frame["Age"]
    group_masks = {
        "25 > Age >= 18 (YOUNG)": (age >= 18) & (age < 25),
        "40 > Age >= 25 (MIDDLE)": (age >= 25) & (age < 40),
        "65 >= Age >= 40 (OLD)": (age >= 40) & (age <= 65),
    }
    return pd.DataFrame(
        {label: frame[mask].count() for label, mask in group_masks.items()}
    ).T


# Row counts per company; the (fully populated) Users column carries the count.
data0 = MasterData.groupby("Company").count()
fig1 = _pie_chart(data0, "Users", "Pink & Yellow Cab Firm Total Users Overview")
fig1

# %%
data1_0 = MasterData[MasterData["Company"] == "Pink Cab"].groupby("City").count()
data1_1 = MasterData[MasterData["Company"] == "Yellow Cab"].groupby("City").count()
fig = _company_bar_chart(
    data1_0, data1_1,
    "Pink & Yellow Cab Firm Users Distribution Over City",
    'indianred', 'blue',
)
fig

# %%
data2_0 = MasterData.groupby("City").count()
fig1 = _pie_chart(data2_0, "Users", "Total Users Overview by Cities")
fig1

# %%
MasterData["User_Pop_Ratio"] = MasterData["Users"] / MasterData["Population"]
MasterData["Profit_of_Trip"] = MasterData["Price_Charged"] - MasterData["Cost_of_Trip"]
# Sum only the profit column: summing the whole frame also tries to add up the
# datetime and categorical columns, which raises TypeError on pandas >= 2.0.
data2_1 = MasterData.groupby("City")[["Profit_of_Trip"]].sum()
fig1 = _pie_chart(data2_1, "Profit_of_Trip", "Total Market Profit Share by Cities")
fig1

# %%
pinkData = MasterData[MasterData['Company'] == 'Pink Cab']
yellowData = MasterData[MasterData['Company'] == 'Yellow Cab']
dict1 = {
    "Pink Cab": [pinkData["Profit_of_Trip"].sum()],
    "Yellow Cab": [yellowData["Profit_of_Trip"].sum()],
}
# Transposed so the firms become the index and the totals sit in column 0.
data2_2 = pd.DataFrame(dict1).T
fig1 = _pie_chart(data2_2, 0, "Total Market Profit Share by Cab Firms")
fig1

# %%
data3_0 = MasterData.groupby("Gender").count()
fig1 = _pie_chart(data3_0, "Users", "Total Users Overview by Gender")
fig1

# %%
data3_1 = MasterData[MasterData["Company"] == "Pink Cab"].groupby("Gender").count()
data3_2 = MasterData[MasterData["Company"] == "Yellow Cab"].groupby("Gender").count()
fig = _company_bar_chart(
    data3_1, data3_2,
    "Pink & Yellow Cab Firm Users Distribution Over Gender",
    'pink', 'orange',
)
fig

# %%
data3_3 = MasterData.groupby("Payment_Mode").count()
fig1 = _pie_chart(data3_3, "Users", "Total Users Overview by Payment Method")
fig1

# %%
# Built through a helper instead of a variable named `dict`, which used to
# shadow the builtin for the rest of the notebook.
data4_0 = _age_group_counts(MasterData)
fig1 = _pie_chart(data4_0, "Users", "Total Users Overview by Age Groups")
fig1

# %%
data4_1 = _age_group_counts(MasterData[MasterData["Company"] == "Pink Cab"])
data4_2 = _age_group_counts(MasterData[MasterData["Company"] == "Yellow Cab"])
fig = _company_bar_chart(
    data4_1, data4_2,
    "Pink & Yellow Cab Firm Users Distributions by Age Groups",
    'pink', 'orange',
)
fig

# %%
data5_0 = MasterData.groupby("City")["Income_(USD/Month)"].mean().to_frame()
fig1 = _pie_chart(data5_0, "Income_(USD/Month)", "Average Income by Cities")
fig1

# %%
data5_1 = MasterData.groupby("Company")["Income_(USD/Month)"].mean().to_frame()
fig1 = _pie_chart(data5_1, "Income_(USD/Month)", "Average Income by Cab Firm")
fig1

# %%
data5_2 = MasterData.groupby("City")["KM_Travelled"].sum().to_frame()
fig1 = _pie_chart(data5_2, "KM_Travelled", "Total KM Travelled by Cities")
fig1

# %%
data5_3 = MasterData.groupby("Company")["KM_Travelled"].sum().to_frame()
fig1 = _pie_chart(data5_3, "KM_Travelled", "Total KM Travelled by Cab Firm")
fig1

# %%
# Profit per kilometre of each individual trip. Dividing the two grand totals
# (as before) stored one constant in every row, so every city and every firm
# ended up with the same "average".
MasterData["Profit_per_KM"] = MasterData["Profit_of_Trip"] / MasterData["KM_Travelled"]
data5_4 = MasterData.groupby("City")["Profit_per_KM"].mean().to_frame()
fig1 = _pie_chart(data5_4, "Profit_per_KM", "Average Profit per KM Travelled by Cities")
fig1

# %%
data5_5 = MasterData.groupby("Company")["Profit_per_KM"].mean().to_frame()
fig1 = _pie_chart(data5_5, "Profit_per_KM", "Average Profit per KM Travelled by Cab Firm")
fig1

# %%
MasterData['Year_of_Travel'] = MasterData['Date_of_Travel'].dt.year
MasterData['Month_of_Travel'] = MasterData['Date_of_Travel'].dt.month
MasterData['Day_of_Travel'] = MasterData['Date_of_Travel'].dt.day
MasterData_TimeSeries = MasterData.set_index('Date_of_Travel')
MasterData_TimeSeries.head(10)

# %%
years_list = ["2016", "2017", "2018"]


def _yearly_profits(company):
    """Return the total profit of *company* for each year in ``years_list``.

    Reads the module-level ``MasterData`` frame; each total is rounded to two
    decimals and the list follows the order of ``years_list``.
    """
    is_company = MasterData.Company == company
    return [
        round(
            MasterData[(MasterData['Year_of_Travel'] == int(year)) & is_company]
            ["Profit_of_Trip"].sum(),
            2,
        )
        for year in years_list
    ]


def year_pink():
    """Return Pink Cab's total profit for 2016, 2017 and 2018, in that order."""
    return _yearly_profits("Pink Cab")


def year_yellow():
    """Return Yellow Cab's total profit for 2016, 2017 and 2018, in that order."""
    return _yearly_profits("Yellow Cab")


yellow = year_yellow()
pink = year_pink()
zippedy = zip(yellow, years_list)
zippedp = zip(pink, years_list)
datap = pd.DataFrame(zippedp, columns=["Profit", "Year"])
datay = pd.DataFrame(zippedy, columns=["Profit", "Year"])

fig = go.Figure()
fig.add_trace(go.Scatter(x=datay.Year, y=datay.Profit,
                         mode='lines+markers', name='Yellow Cab'))
fig.add_trace(go.Scatter(x=datap.Year, y=datap.Profit,
                         mode='lines+markers', name='Pink Cab'))
fig.update_layout(
    title="Total Profit per year by Cab Firm",
    xaxis_title="Years",
    yaxis_title="Profits",
    legend_title="Cab Companies",
)
fig.show()

# %%
MasterData.info()